# Description ------------------------------------------------------------------
#   This script accumulates the individual zip-by-utility billing datasets.
#   - INPUT: zip-utility (.rds) billing data created by buildPanelPGE.R and
#     buildPanelSoCal.R. These files have consumption and price information.
#   - OUTPUT: a single dataset of consumption and prices for the cities/zip
#     codes served by both PG&E and SoCalGas.
#   - NEXT SCRIPT: Analysis scripts (e.g. analyzeCommonZipsStage1.R).

# Setup ------------------------------------------------------------------------
  # Options
  options(stringsAsFactors = FALSE)
  # Packages
  library(pacman)
  p_load(magrittr, data.table, stringr, lubridate, parallel)
  # Directories of joined bill-weather files
  dir_project <- "S:/Edward/Elasticity/"
  dir_csv     <- paste0(dir_project, "DataCsv/")
  dir_rds     <- paste0(dir_project, "DataR/")
  dir_pge     <- paste0(dir_rds, "/Prices/PGE/")
  dir_socal   <- paste0(dir_rds, "/Prices/SoCal/")

# Define desired variables -----------------------------------------------------
  # Define variables that I want without lags
  var_nolag <- c(
    "begin_date", "end_date", "year", "ym",
    "zip", "zip_plus4", "zip9",
    "care", "care_pct",
    "hh_id", "hh_month",
    "climate", "rate_sch", "utility",
    "allowance", "rev",
    NULL)
  # Define variables with which we want lags
  var_lags <- c(
    # "allowance",
    "thm", "thm_day",
    "days", "month",
    # paste0("t_p_", str_pad(1:14, 2, "left", 0)),
    "bill_hdd",
    # "east_year_hdd", "ca_year_hdd",
    # "east_year_cdd", "ca_year_cdd",
    # "east_year_dd", "ca_year_dd",
    "total_bill",
    "bill_flag",
    # "care",
    "price_avg", "price_avg_mrg",
    "price_base", "price_base_ppps",
    # "price_excess", "price_excess_ppps",
    # "price_base_init", "price_excess_init",
    # "ppps_charge",
    "price_mrg", "price_mrg_ppps",
    "mrg_sim_iv_10_14", "mrg_sim_iv_11_13",
    "spot_price_1week",
    "spot_price_2week",
    "spot_price_3week",
    # "spot_price_1week_init", "spot_price_2week_init", "spot_price_3week_init",
    NULL)
  # Define lags and leads
  the_lags <- c(1:5) %>% paste0("_lag", .)
  the_leads <- c(1:3) %>% paste0("_lead", .)
  # Defined all desired variables (including lags and leads)
  all_vars <- c(
    var_nolag, var_lags,
    paste0(rep(var_lags, each = length(the_lags)), the_lags),
    paste0(rep(var_lags, each = length(the_leads)), the_leads))

# Load the zip code neighbors file ---------------------------------------------
# NOTE: Created by 'buildZipCodeNeighborGroups.R'
  # Load the file
  group_dt <- readRDS(paste0(dir_rds, "zipCodeGroups.rds"))
  # Keep groups 0 (the zip codes common to both utilities) through 3
  group_dt <- group_dt[group <= 3, .(zip, group, pge, socal)]
  setorder(group_dt, "group", "zip")

# Find files and common zips ---------------------------------------------------
  # Find the file names for each utility
  pge_files <- dir_pge %>% dir()
  socal_files <- dir_socal %>% dir()
  # Find the zip codes for each utility
  pge_zips <- gsub("[^0-9]", "", pge_files)
  socal_zips <- gsub("[^0-9]", "", socal_files)

# Find common cities and their zips --------------------------------------------
  # Load the zip-city map and restrict to California; drop state
  zip_city <- fread(paste0(dir_csv, "zipCodeDatabase.csv"),
    select = c("zip", "state", "primary_city"))[state == "CA"][, state := NULL]
  # Change name of "primary_city" to "city"
  setnames(zip_city, "primary_city", "city")
  # Add features for whether the zip PGE and SoCal serve the zip
  zip_city[, `:=`(pge = F, socal = F)]
  zip_city[zip %in% pge_zips, pge := T]
  zip_city[zip %in% socal_zips, socal := T]
  # Collapse zip_city by city to find cities served by both utilities
  city_dt <- zip_city[, list(
    n_pge = sum(pge),
    n_socal = sum(socal)),
    by = "city"]
  # Find cities served by both utilities
  common_city_dt <- city_dt[n_pge > 0 & n_socal > 0]
  # Find the zip codes of the cities served by both utilities
  common_city_zips <- zip_city[city %in% common_city_dt$city]
  # Find the cities of the zip codes in group_dt
  zip_city[, zip := as.character(zip)]
  group_city_zips <- merge(x = group_dt, y = zip_city[,.(zip, city)],
    by = "zip", all.x = T, all.y = F, sort = F)
  # Save these data tables
  write.csv(x = common_city_zips,
    file = paste0(dir_csv, "commonCityZips.csv"),
    row.names = F)
  write.csv(x = group_city_zips,
    file = paste0(dir_csv, "groupCityZips.csv"),
    row.names = F)

# Iterate over 'neighbor' groups: create a dataset for each group --------------
  for (g in unique(group_dt[,group])) {

  # Create zip code lists for group g ------------------------------------------
    # PGE's zip codes in group g
    pge_g <- group_dt[group == g & pge == T, zip]
    # Socal's zip codes in group g
    socal_g <- group_dt[group == g & socal == T, zip]

  # PGE: Load data -------------------------------------------------------------
    # Set up cluster
    cl <- makeCluster(getOption("cl.cores", 4))
    clusterExport(cl, varlist = c("all_vars", "dir_pge"))
    clusterEvalQ(cl, library(data.table))
    # Load PGE data from common zips
    pge_dt <- parLapply(
      cl = cl,
      X = pge_g,
      fun = function(i) {
        # Load the dataset
        zip_dt <- readRDS(paste0(dir_pge, "prices_pge_", i, ".rds"))
        # Drop any variable not contained in all_vars
        zip_dt[, (base::setdiff(names(zip_dt), all_vars)) := NULL]
        # Return the dataset
        return(zip_dt)
        }) %>% rbindlist()
    # Kill cluster
    stopCluster(cl)
    # Clean up
    gc()

  # PGE: Flag extreme observations ---------------------------------------------
    # Drop observations outside of 2010-2015
    pge_dt <- pge_dt[between(year, 2010, 2015)]
    # For days in [28,34]:
    # Find the most extreme 1 percent of observations on therms
    thm_lo <- quantile(x = pge_dt[between(days, 28, 34), thm], probs = 0.005)
    thm_hi <- quantile(x = pge_dt[between(days, 28, 34), thm], probs = 0.995)
    # For days in [28,34]:
    # Find the most extreme 1 percent of observations on revenue
    rev_lo <- quantile(x = pge_dt[between(days, 28, 34), rev], probs = 0.005)
    rev_hi <- quantile(x = pge_dt[between(days, 28, 34), rev], probs = 0.995)
    # Flag observations outside of the central 99%
    pge_dt[, `:=`(
      flag_thm = 1 * !between(thm, thm_lo, thm_hi),
      flag_rev = 1 * !between(rev, rev_lo, rev_hi)
      )]
    # Extend flags to households
    pge_dt[, `:=`(
      flag_thm_hh = max(flag_thm, na.rm = T),
      flag_rev_hh = max(flag_rev, na.rm = T)
      ), by = hh_id]
    # Garbage control
    gc()

  # SoCal: Load data -----------------------------------------------------------
    # Set up cluster
    cl <- makeCluster(getOption("cl.cores", 4))
    clusterExport(cl, varlist = c("all_vars", "dir_socal"))
    clusterEvalQ(cl, library(data.table))
    # Load SoCal data from common zips
    socal_dt <- parLapply(
      cl = cl,
      X = socal_g,
      fun = function(i) {
        # Load the dataset
        zip_dt <- readRDS(paste0(dir_socal, "prices_socal_", i, ".rds"))
        # Drop any variable not contained in all_vars
        zip_dt[, (base::setdiff(names(zip_dt), all_vars)) := NULL]
        # Return the dataset
        return(zip_dt)
      }) %>% rbindlist()
    # Kill cluster
    stopCluster(cl)
    # Clean up
    gc()

  # SoCal: Flag extreme observations -------------------------------------------
    # Drop observations outside of 2010-2015
    socal_dt <- socal_dt[between(year, 2010, 2015)]
    # Find the most extreme 1 percent of observations on therms
    thm_lo <- quantile(x = socal_dt[between(days, 28, 34), thm], probs = 0.005)
    thm_hi <- quantile(x = socal_dt[between(days, 28, 34), thm], probs = 0.995)
    # Find the most extreme 1 percent of observations on revenue
    rev_lo <- quantile(x = socal_dt[between(days, 28, 34), rev], probs = 0.005)
    rev_hi <- quantile(x = socal_dt[between(days, 28, 34), rev], probs = 0.995)
    # Flag observations outside of the central 99%
    socal_dt[, `:=`(
      flag_thm = 1 * !between(thm, thm_lo, thm_hi),
      flag_rev = 1 * !between(rev, rev_lo, rev_hi)
      )]
    # Extend flags to households
    socal_dt[, `:=`(
      flag_thm_hh = max(flag_thm, na.rm = T),
      flag_rev_hh = max(flag_rev, na.rm = T)
      ), by = hh_id]
    # Garbage control
    gc()

  # Enforce time overlap -------------------------------------------------------
    # Find overlapping data range
    first_date <- max(min(pge_dt[,begin_date]), min(socal_dt[,begin_date]))
    last_date <- min(max(pge_dt[,begin_date]), max(socal_dt[,begin_date]))
    # Drop observations outside date range
    pge_dt <- pge_dt[between(begin_date, first_date, last_date)]
    socal_dt <- socal_dt[between(begin_date, first_date, last_date)]
    gc()

  # Join utility datasets ------------------------------------------------------
    # Drop columns from PGE data that are not in SoCal data
    pge_dt[, setdiff(names(pge_dt), names(socal_dt)) := NULL]
    # Drop columns from SoCal data that are not in PGE data
    socal_dt[, setdiff(names(socal_dt), names(pge_dt)) := NULL]
    # Join the two utilities' data into a single dataset
    bill_dt <- rbindlist(list(pge_dt, socal_dt), use.names = T, fill = T)
    gc()
    # Drop utilities' datasets
    rm(pge_dt, socal_dt, first_date, last_date)
    gc()

  # Calculate/add additional variables -----------------------------------------
    # Add week-of-year and week-of-year-by-year variables
    bill_dt[, week := week(begin_date)]
    bill_dt[, yw := paste0(year, str_pad(week, 2, "left", 0))]
    # Create zip by week-of-sample variable
    bill_dt[, zip_yw := paste0(zip, yw)]
    # Add zip-year variable
    bill_dt[, zip_year := paste0(zip, year)]
    # Add household-year variable
    bill_dt[, hh_year := paste0(hh_id, year)]
    # Add the city
    bill_dt %<>% merge(y = common_city_zips[, .(zip, city)],
      by = "zip", all.x = T, all.y = F, sort = F)
    # Add city-year, city-month, city-year-month, and city-year-week variables
    bill_dt[, `:=`(
      city_year  = paste0(city, year),
      city_month = paste0(city, month),
      city_ym    = paste0(city, ym),
      city_yw    = paste0(city, yw)
      )]
    # Add city-year-week-utility variable
    bill_dt[, city_yw_utility := paste0(city_yw, utility)]
    # Add zip-year, zip-month, zip-year-month, and zip-year-week variables
    bill_dt[, `:=`(
      zip_year  = paste0(zip, year),
      zip_month = paste0(zip, month),
      zip_ym    = paste0(zip, ym),
      zip_yw    = paste0(zip, yw)
      )]
    # Add zip-year-week-utility variable
    bill_dt[, zip_yw_utility := paste0(zip_yw, utility)]
    gc()
    # Find the first (integer) date in the sample
    day1 <- bill_dt[, begin_date] %>% unclass() %>% min()
    # Add variable for price cluster: a combination of billing cycle,
    # utility, and climate zone
    # NOTE: The utilities code their climate zone differently
    bill_dt[, price_cluster := paste0(
      unclass(begin_date) - day1, "_",
      unclass(end_date) - day1, "_",
      climate
      )]
    # Create variable for interaction between utility and month-of-sample
    bill_dt[, utility_ym := paste0(utility, "_", ym)]
    # Change HDD and CDD variables to thousands
    vars_hdd <- bill_dt %>% names() %>% grep("hdd", ., value = T)
    vars_cdd <- bill_dt %>% names() %>% grep("cdd", ., value = T)
    for (j in c(vars_cdd, vars_hdd)) {
      set(x = bill_dt, j = j, value = bill_dt[[j]] / 1e3)
    }
    rm(j, vars_hdd, vars_cdd); gc()

  # Season variables -----------------------------------------------------------
    # Create seasons
    bill_dt[between(month(begin_date), 4, 9), season := "summer"]
    bill_dt[!between(month(begin_date), 4, 9), season := "winter"]

  # Save the dataset -----------------------------------------------------------
    # Order columns alphabetically
    setcolorder(bill_dt, order(names(bill_dt)))
    # Save
    saveRDS(object = bill_dt,
      file = paste0(dir_rds, "Prices/Joint/zipsNeighbor", g, ".rds"))

  # Create subsets -------------------------------------------------------------
    # Sample household IDs
    hh_ids <- bill_dt[, hh_id] %>% unique()
    n_ids <- length(hh_ids)
    # Sample 1% of IDs
    set.seed(12345)
    sampled_ids <- hh_ids[sample(n_ids, size = floor(0.01 * n_ids))]
    # Create sampled dataset
    sub_dt <- bill_dt[hh_id %in% sampled_ids]
    # Save
    saveRDS(object = sub_dt,
      file = paste0(dir_rds,
        "Prices/Joint/zipsNeighbor", g, "_01PercentSample.rds"))
    # Sample 10% of IDs
    set.seed(12345)
    sampled_ids <- hh_ids[sample(n_ids, size = floor(0.10 * n_ids))]
    # Create sampled dataset
    sub_dt <- bill_dt[hh_id %in% sampled_ids]
    # Save
    saveRDS(object = sub_dt,
      file = paste0(dir_rds,
        "Prices/Joint/zipsNeighbor", g, "_10PercentSample.rds"))

  }

# Clean up ---------------------------------------------------------------------
  # Remove all objects from memory
  rm(list = ls())
  # Garbage control
  gc()
